Note: Open scripts.Rproj first, then script. To easily use relative paths, click the down button next to knit and then click “Knit Directory –> Project Directory”. This should make loading and saving files much easier.
Run GO Term enrichment analysis using results from WGCNA analysis
(i.e., after running the 2a-WGCNA.Rmd script). Red text indicates regions that require the
user to modify. Regardless, the user should check over all code blocks
to ensure that everything is running correctly.
Load packages
library(goseq)
library(tidyverse)
library(GSEABase)
library(data.table)
library(ggplot2)
library(cowplot)
library(patchwork)
Change file names and conditions where appropriate.
# Input unfiltered data
treatmentinfo.file <- "salmon.numreads.WGCNA_samples.tsv"
wgcna_results.file <- "salmon.numreads.WGCNA_results.tsv"
annotation.file <- "Pocillopora_acuta_KBHIv2.pep.GOs.tsv"
# Output DEG results
GO_term_sig.file <- "salmon.numreads.WGCNA_results.GOsig.tsv"
KEGG_sig.file <- "salmon.numreads.WGCNA_results.KEGGsig.tsv"
# Cutoff for significant GO term p-values
GO_enrich_pvalue.cutoff <- 0.05
KEGG_enrich_pvalue.cutoff <- 0.05
# KEGG ID description file - provided with script
KEGG_IDs_Descriptions.file <- "2b-KEGG_IDs_Descriptions.tsv"
Load the input file containing the treatment information.
treatmentinfo <- read.csv(treatmentinfo.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM") # Read in file
head(treatmentinfo)
## sample_id unit lib_type fq1 fq2 species plug_id
## 1 Pacuta_ATAC_TP1_1043 1 pe SRR14610932 SRR14610932 Pacuta 1043
## 2 Pacuta_ATAC_TP1_1775 1 pe SRR14610931 SRR14610931 Pacuta 1775
## 3 Pacuta_ATAC_TP1_2363 1 pe SRR14610930 SRR14610930 Pacuta 2363
## 4 Pacuta_ATAC_TP3_1041 1 pe SRR14610929 SRR14610929 Pacuta 1041
## 5 Pacuta_ATAC_TP3_1471 1 pe SRR14610928 SRR14610928 Pacuta 1471
## 6 Pacuta_ATAC_TP3_1637 1 pe SRR14610927 SRR14610927 Pacuta 1637
## treatment timepoint reef ploidy group num
## 1 ATAC 1 Lilipuna.Fringe 3 Group4 1
## 2 ATAC 1 Reef.42.43 2 Group5 1
## 3 ATAC 1 Reef.18 3 Group3 1
## 4 ATAC 3 Reef.11.13 3 Group2 1
## 5 ATAC 3 Reef.35.36 3 Group3 1
## 6 ATAC 3 Reef.11.13 3 Group2 1
# Check we have the right column names
headers <- c("sample_id")
if( all(headers %in% colnames(treatmentinfo)) ){
print(paste(treatmentinfo.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
stop(paste(treatmentinfo.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "salmon.numreads.WGCNA_samples.tsv has the required columns: sample_id"
Load the input file containing WGCNA results.
wgcna_results <- as.data.frame(read.csv(wgcna_results.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM")) # Read in file
# Check we have the right column names
headers <- c("gene_id")
if( all(headers %in% colnames(wgcna_results)) ){
print(paste(wgcna_results.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
stop(paste(wgcna_results.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "salmon.numreads.WGCNA_results.tsv has the required columns: gene_id"
Filter wgcna_results to just the module that we want to
analyze. Adjust filtering as
required.
wgcna_results <- wgcna_results %>%
filter(moduleColor %in% c("darkred"))
head(wgcna_results)
## gene_id moduleColor GS.group
## 1 Pocillopora_acuta_KBHIv2___RNAseq.g18206.t1 darkred -0.03207267
## 2 Pocillopora_acuta_KBHIv2___TS.g21588.t1 darkred -0.03402285
## 3 Pocillopora_acuta_KBHIv2___RNAseq.g21467.t1a darkred 0.03428403
## 4 Pocillopora_acuta_KBHIv2___TS.g26006.t2 darkred 0.04265057
## 5 Pocillopora_acuta_KBHIv2___RNAseq.g24677.t1 darkred 0.04354234
## 6 Pocillopora_acuta_KBHIv2___RNAseq.21015_t darkred 0.05834933
## p.GS.group MM.green p.MM.green MM.yellow p.MM.yellow MM.magenta
## 1 0.7313859 -0.2954921 0.001218716723 0.2099302 0.023102245593 0.26224908
## 2 0.7157328 -0.2479159 0.007038059802 0.2297861 0.012692763678 0.09474403
## 3 0.7136453 -0.3544783 0.000088101563 0.2664554 0.003685433089 0.24342122
## 4 0.6479650 -0.4257462 0.000001706014 0.4192948 0.000002533992 0.11552494
## 5 0.6411104 -0.2887780 0.001590567087 0.1643817 0.076554432284 0.04444531
## 6 0.5320361 -0.3272705 0.000316173209 0.1528483 0.099920641962 -0.06917879
## p.MM.magenta MM.lightcyan p.MM.lightcyan MM.darkgrey p.MM.darkgrey
## 1 0.004285127 0.01901123 0.8387846 0.1525828 0.10051821477
## 2 0.309582686 0.06888993 0.4604929 0.2290520 0.01298816345
## 3 0.008177837 -0.01634581 0.8611422 0.2588572 0.00483076887
## 4 0.214851733 0.05536727 0.5532389 0.3843956 0.00001878854
## 5 0.634200409 0.03476737 0.7097875 0.2063496 0.02560647413
## 6 0.458610037 0.12976126 0.1631920 0.1402469 0.13150876798
## MM.brown p.MM.brown MM.darkgreen p.MM.darkgreen MM.royalblue
## 1 0.32342826 0.00037518863 0.4611408 0.0000001673496227 -0.11387554
## 2 0.19088254 0.03925173212 0.3879501 0.0000154775243793 -0.05407675
## 3 0.28967623 0.00153546715 0.5408257 0.0000000003080091 -0.20387922
## 4 0.38772901 0.00001566628 0.5420219 0.0000000002766267 -0.10426734
## 5 0.03016025 0.74684380557 0.3361713 0.0002108435316810 -0.14774859
## 6 -0.01177189 0.89975589405 0.3179398 0.0004772341356053 -0.03564825
## p.MM.royalblue MM.darkturquoise p.MM.darkturquoise MM.darkred
## 1 0.22151536 0.02196884 0.81412619 0.6962332
## 2 0.56253977 0.01032036 0.91206346 0.4729281
## 3 0.02746629 0.04126119 0.65870314 0.7333859
## 4 0.26323947 0.07163859 0.44275039 0.7420472
## 5 0.11189692 0.18356344 0.04758099 0.3859649
## 6 0.70277582 -0.15767693 0.08953358 0.3060560
## p.MM.darkred MM.midnightblue p.MM.midnightblue MM.grey60
## 1 0.000000000000000002920906258 0.14812762 0.110970022 0.36123754
## 2 0.000000072736116091717396760 0.22157673 0.016354363 0.19388954
## 3 0.000000000000000000005332528 0.23208925 0.011803784 0.28256291
## 4 0.000000000000000000001051349 0.29338354 0.001325931 0.32256203
## 5 0.000017252112707076900423872 0.28859276 0.001602152 -0.01420550
## 6 0.000791173268813867049120692 -0.06177483 0.508192283 -0.03379452
## p.MM.grey60 MM.lightgreen p.MM.lightgreen MM.purple p.MM.purple
## 1 0.00006296495 0.56538600 0.00000000003107586 -0.06524957 0.484583541264
## 2 0.03620305820 0.33446523 0.00022808740092415 -0.19925906 0.031254068495
## 3 0.00202373068 0.57298303 0.00000000001470413 -0.25680082 0.005191028459
## 4 0.00038982649 0.46727947 0.00000010885875478 -0.40954227 0.000004538237
## 5 0.87917594729 0.26026345 0.00459744180687308 -0.37312798 0.000034231090
## 6 0.71755954761 0.07702289 0.40913770282975298 -0.08028197 0.389541744824
## MM.greenyellow p.MM.greenyellow MM.lightyellow p.MM.lightyellow MM.pink
## 1 0.32903701 0.00029202818190 0.01837345 0.84412334870 0.6289653
## 2 0.26800680 0.00348401430944 -0.14569225 0.11703102359 0.4418650
## 3 0.36695766 0.00004710474873 -0.16886146 0.06875928239 0.6857097
## 4 0.48808936 0.00000002375842 -0.36703550 0.00004691732 0.6211864
## 5 0.09611212 0.30261664656232 -0.27502849 0.00269054602 0.3841575
## 6 -0.09098295 0.32926766901003 0.06146262 0.51034234633 0.2544099
## p.MM.pink MM.tan p.MM.tan MM.red p.MM.red
## 1 0.00000000000003110185686 0.29598188 0.00119497096 -0.17422162 0.06029145
## 2 0.00000061216834424629705 0.29569838 0.00120866320 -0.18702807 0.04347152
## 3 0.00000000000000001468793 0.34205200 0.00016023365 -0.18499196 0.04584913
## 4 0.00000000000007880754650 0.42720300 0.00000155844 -0.16376855 0.07767460
## 5 0.00001903264233562839897 0.13479527 0.14733721057 -0.05450836 0.55942087
## 6 0.00563986248076380012467 -0.01315947 0.88801274683 0.01873769 0.84107345
## MM.turquoise p.MM.turquoise MM.orange p.MM.orange MM.blue p.MM.blue
## 1 -0.102299695 0.2724045 -0.04878553 0.601432476 -0.03470709 0.71026826
## 2 -0.051650668 0.5802235 0.15083561 0.104520769 -0.11747855 0.20714509
## 3 -0.055545072 0.5519634 0.06179423 0.508058851 -0.12066546 0.19500173
## 4 -0.114997466 0.2169671 0.27593099 0.002601376 -0.07547774 0.41862697
## 5 0.004607088 0.9606814 0.01953099 0.834439136 -0.24360792 0.00812743
## 6 0.034040411 0.7155924 -0.14703976 0.113646491 -0.18516055 0.04564824
## MM.black p.MM.black MM.cyan p.MM.cyan MM.salmon p.MM.salmon
## 1 -0.26540413 0.003827795 -0.3693122 0.00004173412 0.22517828 0.014648281
## 2 -0.24758335 0.007117293 -0.2118528 0.02184644486 0.14315313 0.123620186
## 3 -0.26118846 0.004449504 -0.2861430 0.00176272684 0.17699429 0.056258016
## 4 -0.26149576 0.004401307 -0.1675263 0.07101213558 0.27799643 0.002407257
## 5 -0.14045907 0.130920079 -0.1076121 0.24814666541 -0.03144554 0.736443456
## 6 -0.09145676 0.326744653 -0.1883824 0.04194766139 -0.18839045 0.041938749
Load the input file containing gene annotations (GO terms, KEGG IDs, and gene/protein lengths).
# Annotations
annot <- read.csv(annotation.file, header=TRUE, sep="\t", fileEncoding="UTF-8-BOM")
# Check we have the right column names
headers <- c("gene_id", "GO_IDs", "KEGG_IDs", "length")
if( all(headers %in% colnames(annot)) ){
paste(paste(annotation.file, "has the required columns:", paste(headers, collapse=', ')))
} else {
stop(paste(annotation.file, "is missing required columns:", paste(headers, collapse=', ')))
}
## [1] "Pocillopora_acuta_KBHIv2.pep.GOs.tsv has the required columns: gene_id, GO_IDs, KEGG_IDs, length"
# Select only relevant information
Go.ref <- subset(annot, select= c(gene_id, length))
# Merge wgcna_results by available annotations
Go.ref <- merge(wgcna_results, Go.ref, by = "gene_id")
Go.ref <- unique(Go.ref)
dim(Go.ref)
## [1] 108 55
Set ID and gene length vectors, and make a binary matrix indicating which genes are differentially expressed. These are used as input to nullp, which for calculates a Probability Weighting Function for DEGs.
# Make ID and length vectors
IDvector <- annot$gene_id
lengthVector <- annot$length
# Get binary list indicating which genes are in WGCNA results and which are not out of all genes
wgcna.genes <- as.integer(annot$gene_id %in% Go.ref$gene_id)
names(wgcna.genes) <- annot$gene_id
print(paste("Number of WGCNA genes: ", length(wgcna.genes[wgcna.genes == 1]), sep=''))
## [1] "Number of WGCNA genes: 108"
print(paste("Number of NON WGCNA genes: ", length(wgcna.genes[wgcna.genes == 0]), sep=''))
## [1] "Number of NON WGCNA genes: 24804"
# Weight vector by length of gene
pwf <- nullp(DEgenes=wgcna.genes, id=IDvector, bias.data=lengthVector)
## Warning in pcls(G): initial point very close to some inequality constraints
Prepare GO term dataframe.
# Cleanup GO terms and split multiple GO terms per line, into one GO term per line
GO.annot <- annot %>%
subset(select=c(gene_id, GO_IDs)) %>%
drop_na() %>% # Remove rows with missing GO terms or gene IDs
filter(GO_IDs!="-") # Remove rows with missing values indicated by '-'
splitted <- strsplit(as.character(GO.annot$GO_IDs), "; ") # Split into multiple GO ids
GO.terms <- data.frame(v1 = rep.int(GO.annot$gene_id, sapply(splitted, length)), v2 = unlist(splitted)) # List all genes with each of their GO terms in a single row
colnames(GO.terms) <- c("gene_id", "GO.ID")
# Cleanup single-line GO term dataframe
GO.terms <- GO.terms %>%
mutate(GO.ID = gsub(" ", "", GO.ID)) %>% # Remove spaces from GO terms - in case any exist by mistake
mutate(GO.ID = as.character(GO.ID)) %>%
mutate(GO.ID = factor(GO.ID, levels=unique(GO.ID))) %>%
mutate(gene_id = factor(gene_id, levels=unique(gene_id))) %>%
unique()
# Print stats
print(paste("No rows in 'GO.terms': ", dim(GO.terms)[1], sep=''))
## [1] "No rows in 'GO.terms': 1655777"
print(paste("Avg GO IDs per gene: ", nrow(GO.terms) / length(unique(GO.terms$gene_id)), sep=''))
## [1] "Avg GO IDs per gene: 142.26110490592"
head(GO.terms)
## gene_id GO.ID
## 1 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001101
## 2 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001932
## 3 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0001933
## 4 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002009
## 5 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002020
## 6 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b GO:0002165
Find enriched GO terms, “selection-unbiased testing for category enrichment amongst significantly expressed genes for RNA-seq data”.
GOall <- goseq(pwf, GOref$gene_id, gene2cat=GO.terms, test.cats=c("GO:CC", "GO:BP", "GO:MF"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
Find only enriched GO terms that are statistically significant at cutoff.
GOall.filtered <- GOall %>%
filter(over_represented_pvalue < GO_enrich_pvalue.cutoff | under_represented_pvalue < GO_enrich_pvalue.cutoff) %>% # Filter GO enrichment results by p-value
arrange(ontology, over_represented_pvalue, -numDEInCat) %>% # Reorder
mutate(term = factor(term, levels = unique(term))) %>% # Make 'terms' column a factor
mutate(dir = if_else(over_represented_pvalue < GO_enrich_pvalue.cutoff,
"Over",
if_else(under_represented_pvalue < GO_enrich_pvalue.cutoff,
"Under",
"NULL"
)
)
)
# Print stats
print(paste("Number GOs BEFORE sig filtering: ", nrow(GOall), sep=''))
## [1] "Number GOs BEFORE sig filtering: 21041"
print(paste("Number GOs AFTER sig filtering: ", nrow(GOall.filtered), sep=''))
## [1] "Number GOs AFTER sig filtering: 284"
print(paste("Number GOs AFTER sig filtering OVER: ", nrow(GOall.filtered %>% filter(over_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number GOs AFTER sig filtering OVER: 201"
print(paste("Number GOs AFTER sig filtering UNDER: ", nrow(GOall.filtered %>% filter(under_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number GOs AFTER sig filtering UNDER: 83"
print(paste("Number sig BP terms: ", nrow(filter(GOall.filtered, ontology=="BP")), sep=''))
## [1] "Number sig BP terms: 192"
print(paste("Number sig MF terms: ", nrow(filter(GOall.filtered, ontology=="MF")), sep=''))
## [1] "Number sig MF terms: 43"
print(paste("Number sig CC terms: ", nrow(filter(GOall.filtered, ontology=="CC")), sep=''))
## [1] "Number sig CC terms: 36"
Correct any un-annotated terms/ontologies.
NAs.ontology <- GOall.filtered %>% subset(is.na(term))
NAs.ontology
## category over_represented_pvalue under_represented_pvalue numDEInCat
## 272 GO:0044462 0.0004026618 0.999997456 2
## 273 GO:0030898 0.0047769228 0.999854369 2
## 274 GO:0015307 0.0063233891 1.000000000 1
## 275 GO:0044257 0.0147909874 0.995684810 7
## 276 GO:0035042 0.0289434734 0.999684923 1
## 277 GO:0044265 0.0349708204 0.986850633 8
## 278 GO:0004003 0.0397530084 0.996128229 2
## 279 GO:0033613 0.0486189347 0.994659753 2
## 280 GO:0006928 0.9892080638 0.035336862 3
## 281 GO:0000904 0.9933563634 0.040830321 1
## 282 GO:0010941 0.9994985101 0.004458872 1
## 283 GO:0010942 1.0000000000 0.031163280 0
## 284 GO:0051270 1.0000000000 0.015034446 0
## numInCat term ontology dir
## 272 5 <NA> <NA> Over
## 273 20 <NA> <NA> Over
## 274 1 <NA> <NA> Over
## 275 546 <NA> <NA> Over
## 276 4 <NA> <NA> Over
## 277 817 <NA> <NA> Over
## 278 52 <NA> <NA> Over
## 279 84 <NA> <NA> Over
## 280 1571 <NA> <NA> Under
## 281 964 <NA> <NA> Under
## 282 1553 <NA> <NA> Under
## 283 720 <NA> <NA> Under
## 284 841 <NA> <NA> Under
Save significant terms.
write.table(GOall.filtered, file = GO_term_sig.file, row.names=F, quote=F, sep='\t')
Run GOslim to get broader categories.
slim <- getOBOCollection("http://current.geneontology.org/ontology/subsets/goslim_generic.obo") # Get GO database
## BP
BP_GO <- GOall.filtered %>%
filter(ontology=="BP")
BPGO_collection <- GOCollection(BP_GO$category) # Make library of query terms
slims_bp <- data.frame(goSlim(BPGO_collection, slim, "BP")) # Find common parent terms to slim down our list
slims_bp$category <- row.names(slims_bp) # Save rownames as category
## MF
MF_GO <- GOall.filtered %>%
filter(ontology=="MF")
MFGO_collection <- GOCollection(MF_GO$category) # Make library of query terms
slims_mf <- data.frame(goSlim(MFGO_collection, slim, "MF")) # Find common parent terms to slim down our list
slims_mf$category <- row.names(slims_mf) # Save rownames as category
Get mapped terms, using functions from Sam White’s Biostars post.
# Write function mappedIds to get the query terms that mapped to the slim categories
mappedIds <-function(df, collection, OFFSPRING) { # The command to run requires a dataframe of slim terms, like slims_MF above, your list of query terms, and the offspring from the GOCollection by goSlim
map <- as.list(OFFSPRING[rownames(df)]) # Subset GOcollection offspring by the rownames of your dataframe
mapped <- lapply(map, intersect, ids(collection)) # Find the terms that intersect between the subset made above of your query terms and the GOids from the GO collection
df[["go_terms"]] <- vapply(unname(mapped), paste, collapse = ";", character(1L)) # Add column "go_terms" with matching terms
df # Show resulting dataframe
}
# Run function for MF and BP terms
BPslim <- mappedIds(slims_bp, BPGO_collection, GOBPOFFSPRING)
MFslim <- mappedIds(slims_mf, MFGO_collection, GOMFOFFSPRING)
Remove duplicate matches, keeping the broader umbrella term
# BP
BPslim <- filter(BPslim, Count>0 & Term!="biological_process") # Filter out empty slims and term "biological process"
BPsplitted <- strsplit(as.character(BPslim$go_terms), ";") # Split into multiple GO ids
BPslimX <- data.frame(Term = rep.int(BPslim$Term, sapply(BPsplitted, length)), go_term = unlist(BPsplitted)) # List all
BPslimX <- merge(BPslimX, BPslim[,c(1,3:4)], by="Term") # Add back counts, term, and category info
BPslimX <- unique(setDT(BPslimX)[order(go_term, -Count)], by = "go_term") # Remove duplicate offspring terms, keeping only those that appear in the larger umbrella term (larger Count number)
BPslim <- data.frame(slim_term=BPslimX$Term, slim_cat=BPslimX$category, category=BPslimX$go_term) # Rename columns
head(BPslim)
## slim_term slim_cat category
## 1 cytoskeleton organization GO:0007010 GO:0000226
## 2 vesicle-mediated transport GO:0016192 GO:0000301
## 3 DNA repair GO:0006281 GO:0000731
## 4 anatomical structure development GO:0048856 GO:0000902
## 5 immune system process GO:0002376 GO:0002698
## 6 reproductive process GO:0022414 GO:0003006
# MF
MFslim <- filter(MFslim, Count>0 & Term!="molecular_function") # Filter out empty slims and term "molecular function"
MFsplitted <- strsplit(as.character(MFslim$go_terms), ";") # Split into multiple GO ids
MFslimX <- data.frame(Term = rep.int(MFslim$Term, sapply(MFsplitted, length)), go_term = unlist(MFsplitted)) # List all
MFslimX <- merge(MFslimX, MFslim[,c(1,3:4)], by="Term") # Add back counts, term, and category info
MFslimX <- unique(setDT(MFslimX)[order(go_term, -Count)], by = "go_term") # Remove duplicate offspring terms, keeping only
MFslim <- data.frame(slim_term=MFslimX$Term, slim_cat=MFslimX$category, category=MFslimX$go_term) # Rename columns
head(MFslim)
## slim_term slim_cat category
## 1 catalytic activity GO:0003824 GO:0004019
## 2 catalytic activity GO:0003824 GO:0004364
## 3 catalytic activity GO:0003824 GO:0004745
## 4 catalytic activity GO:0003824 GO:0004844
## 5 catalytic activity GO:0003824 GO:0008172
## 6 catalytic activity GO:0003824 GO:0008336
Add back GO enrichment info for each offspring term.
GO.BP <- right_join(BPslim, filter(GOall.filtered, ontology=="BP"), by="category")
GO.MF <- right_join(MFslim, filter(GOall.filtered, ontology=="MF"), by="category")
Plot heatmap of BP and MF GO slim terms.
term_label_text_size <- 6
slim_label_text_size <- 6
BPplot <- GO.BP %>%
filter(numInCat>5) %>%
mutate(term = fct_reorder(term, -over_represented_pvalue)) %>%
ggplot(aes(x = dir, y = term)) +
geom_tile(aes(fill=over_represented_pvalue, width = 1)) +
facet_grid(slim_term ~ ontology, scales = "free_y", labeller = label_wrap_gen(width = 10, multi_line = TRUE))+
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = slim_label_text_size, face = "bold"),
strip.text.x = element_text(size = 12, face = "bold"),
axis.title.x = element_blank(),
axis.title.y = element_text(size=15),
axis.text = element_text(size = term_label_text_size), legend.position = "None",
plot.margin = unit(c(0,1,0,0.25), "cm")
); BPplot
MFplot <- GO.MF %>%
filter(numInCat>5) %>%
mutate(term = fct_reorder(term, -over_represented_pvalue)) %>%
ggplot(aes(x = dir, y = term)) +
geom_tile(aes(fill=over_represented_pvalue, width = 1)) +
scale_y_discrete(position = "right") +
facet_grid(slim_term ~ ontology,
scales = "free_y",
labeller = label_wrap_gen(width = 10, multi_line = TRUE),
switch="y" # Put the y facet strips on the left
) +
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
strip.text.y.left = element_text(angle=0, size = slim_label_text_size, face = "bold"),
strip.text.x = element_text(size = 12, face = "bold"),
axis.title = element_blank(),
axis.text = element_text(size = term_label_text_size),
legend.title = element_text(size = 12),
legend.text = element_text(size = 11)
); MFplot
Combined BP and MF plots.
BPplot + MFplot + plot_annotation(tag_levels = "A", tag_suffix = ")") & theme(plot.tag = element_text(size=15, face="bold"))
Select KEGG Orthogroup IDs (KOs) from annotation dataframe.
# Extract gene_ids and KO from annotation file
KO.terms <- annot %>%
subset(select=c(gene_id, KEGG_IDs)) %>% # Select columns
drop_na() %>% # Remove rows with missing GO terms or gene IDs
filter(KEGG_IDs!="-") # Remove rows with missing values indicated by '-'
# Split multiple KEGG IDs per line into multiple lines
splitted <- strsplit(as.character(KO.terms$KEGG_IDs), "; ") # Split into multiple KEGG IDs
KO.terms <- data.frame(v1 = rep.int(KO.terms$gene_id, sapply(splitted, length)), v2 = unlist(splitted)) # List all genes with each of their KEGG IDs in a single row
colnames(KO.terms) <- c("gene_id", "KEGG_IDs")
KO.terms <- unique(KO.terms)
colnames(KO.terms) <- c("gene_id", "GO.ID")
head(KO.terms)
## gene_id GO.ID
## 1 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1a K00549
## 2 Pocillopora_acuta_KBHIv2___RNAseq.g24143.t1b K16866
## 3 Pocillopora_acuta_KBHIv2___RNAseq.g18333.t1 K03386
## 4 Pocillopora_acuta_KBHIv2___RNAseq.g7985.t1 K12418
## 5 Pocillopora_acuta_KBHIv2___TS.g15308.t1 K13755
## 6 Pocillopora_acuta_KBHIv2___RNAseq.g2057.t1 K11001
# Bind KO and GO references
GOKO.terms <- bind_rows(GO.terms, KO.terms)
Perform Kegg enrichment with goseq package.
KOall <- goseq(pwf, GOref$gene_id, gene2cat=GOKO.terms, test.cats=c("KEGG"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
Extract significantly enriched KEGG terms.
KOall.filtered <- KOall %>%
filter(over_represented_pvalue < KEGG_enrich_pvalue.cutoff | under_represented_pvalue < KEGG_enrich_pvalue.cutoff) %>% # Filter GO enrichment results by p-value
arrange(ontology, over_represented_pvalue, -numDEInCat) %>% # Reorder
mutate(term = factor(term, levels = unique(term))) %>% # Make 'terms' column a factor
mutate(dir = if_else(over_represented_pvalue < GO_enrich_pvalue.cutoff,
"Over",
if_else(under_represented_pvalue < GO_enrich_pvalue.cutoff,
"Under",
"NULL"
)
)
) %>%
mutate(ontology = replace_na(ontology, "KEGG")) %>%
filter(ontology=="KEGG")
# Print stats
print(paste("Number KEGG IDs BEFORE sig filtering: ", nrow(KOall), sep=''))
## [1] "Number KEGG IDs BEFORE sig filtering: 28185"
print(paste("Number KEGG IDs AFTER sig filtering: ", nrow(KOall.filtered), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering: 63"
print(paste("Number KEGG IDs AFTER sig filtering OVER: ", nrow(KOall.filtered %>% filter(over_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering OVER: 58"
print(paste("Number KEGG IDs AFTER sig filtering UNDER: ", nrow(KOall.filtered %>% filter(under_represented_pvalue < GO_enrich_pvalue.cutoff)), sep=''))
## [1] "Number KEGG IDs AFTER sig filtering UNDER: 5"
Add KO definitions.
# Load definition file
KEGG_IDs_Descriptions <- read.table(KEGG_IDs_Descriptions.file, header = TRUE, sep = "\t", fileEncoding="UTF-8-BOM", quote='') # Read in file
# Prep definition data
KEGG_IDs_Descriptions <- KEGG_IDs_Descriptions %>%
subset(select=c("D.ID", "D.Description")) %>%
unique()
colnames(KEGG_IDs_Descriptions) <- c("category", "description")
# Merge with KEGG output
KOall.filtered.descriptions <- unique(left_join(KOall.filtered, KEGG_IDs_Descriptions, by=c("category")))
Write output KEGG enrichment files.
write.table(KOall.filtered.descriptions, file = KEGG_sig.file, row.names=F, quote=F, sep='\t')